/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.searcher;

import java.net.InetSocketAddress;
import java.io.*;
import java.util.*;
import java.util.logging.Logger;

import net.nutch.util.LogFormatter;
import net.nutch.io.*;
import net.nutch.ipc.*;

/** Implements the search API over IPC connections. */
public class DistributedSearch {
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.searcher.DistributedSearch");

  private DistributedSearch() {}                  // no public ctor

  // op codes for IPC calls
  private static final byte OP_SEGMENTS = (byte)0;
  private static final byte OP_SEARCH   = (byte)1;
  private static final byte OP_EXPLAIN  = (byte)2;
  private static final byte OP_DETAILS  = (byte)3;
  private static final byte OP_SUMMARY  = (byte)4;
  private static final byte OP_CONTENT  = (byte)5;
  private static final byte OP_ANCHORS  = (byte)6;

  /** Names of the op codes, indexed by op code.  Used for request logging. */
  private static final String[] OP_NAMES = new String[7];
  static {
    OP_NAMES[OP_SEGMENTS] = "getSegmentNames";
    OP_NAMES[OP_SEARCH]   = "search";
    OP_NAMES[OP_EXPLAIN]  = "getExplanation";
    OP_NAMES[OP_DETAILS]  = "getDetails";
    OP_NAMES[OP_SUMMARY]  = "getSummary";
    OP_NAMES[OP_CONTENT]  = "getContent";
    OP_NAMES[OP_ANCHORS]  = "getAnchors";
  }

  /** The parameter passed with IPC requests.  Public only so that {@link
   * Server} can construct instances.
   *
   * <p>Serialized as the op code byte followed by the two operands.  An
   * operation that needs fewer than two operands pads with
   * <code>NullWritable</code>. */
  public static class Param implements Writable {
    private byte op;                              // the op code
    private Writable first;                       // the first operand
    private Writable second;                      // the second operand

    /** No-arg constructor; fields are populated by {@link #readFields}. */
    public Param() {}

    /** Construct a one-operand request; the second operand is
     * <code>NullWritable</code>. */
    Param(byte op, Writable first) {
      this(op, first, NullWritable.get());
    }

    /** Construct a two-operand request. */
    Param(byte op, Writable first, Writable second) {
      this.op = op;
      this.first = first;
      this.second = second;
    }

    /** Writes the op code, then the first and second operands. */
    public void write(DataOutput out) throws IOException {
      out.writeByte(op);
      first.write(out);
      second.write(out);
    }

    /** Reads the op code, instantiates operands of the types that op code
     * requires, then reads both operands.
     * @throws RuntimeException if the op code is not recognized */
    public void readFields(DataInput in) throws IOException {
      op = in.readByte();

      // the op code determines the concrete operand types to instantiate
      switch (op) {
      case OP_SEGMENTS:
        first = NullWritable.get();
        second = NullWritable.get();
        break;
      case OP_SEARCH:
        first = new Query();
        second = new IntWritable();
        break;
      case OP_EXPLAIN:
        first = new Query();
        second = new Hit();
        break;
      case OP_DETAILS:
        first = new Hit();
        second = NullWritable.get();
        break;
      case OP_SUMMARY:
        first = new HitDetails();
        second = new Query();
        break;
      case OP_CONTENT:
      case OP_ANCHORS:
        first = new HitDetails();
        second = NullWritable.get();
        break;
      default:
        throw new RuntimeException("Unknown op code: " + op);
      }

      first.readFields(in);
      second.readFields(in);
    }
  }

  /** The parameter returned with IPC responses.  Public only so that {@link
   * Client} can construct instances.
   *
   * <p>Serialized as the op code byte followed by the result value. */
  public static class Result implements Writable {
    private byte op;                              // op code of the request
    private Writable value;                       // the value returned

    /** No-arg constructor; fields are populated by {@link #readFields}. */
    public Result() {}

    /** Construct a response to the given operation. */
    Result(byte op, Writable value) {
      this.op = op;
      this.value = value;
    }

    /** Writes the op code, then the value. */
    public void write(DataOutput out) throws IOException {
      out.writeByte(op);
      value.write(out);
    }

    /** Reads the op code, instantiates a value of the type that op code
     * returns, then reads the value.
     * @throws RuntimeException if the op code is not recognized */
    public void readFields(DataInput in) throws IOException {
      op = in.readByte();

      // the op code determines the concrete value type to instantiate
      switch (op) {
      case OP_SEGMENTS:
        value = new ArrayWritable(UTF8.class);
        break;
      case OP_SEARCH:
        value = new Hits();
        break;
      case OP_EXPLAIN:
        value = new UTF8();
        break;
      case OP_DETAILS:
        value = new HitDetails();
        break;
      case OP_SUMMARY:
        value = new UTF8();
        break;
      case OP_CONTENT:
        value = new BytesWritable();
        break;
      case OP_ANCHORS:
        value = new ArrayWritable(UTF8.class);
        break;
      default:
        throw new RuntimeException("Unknown op code: " + op);
      }

      value.readFields(in);
    }
  }

  /** The search server. */
  public static class Server extends net.nutch.ipc.Server {
    private NutchBean bean;

    /** Construct a search server on the index and segments in the named
     * directory, listening on the named port. */
    public Server(File directory, int port) throws IOException {
      super(port, Param.class, 10);
      this.bean = new NutchBean(directory);
    }

    /** Handles one request: logs it, dispatches on its op code to the
     * corresponding {@link NutchBean} method, and wraps the value returned in
     * a {@link Result} carrying the same op code.
     * @param param the request; must be a {@link Param}
     * @throws RuntimeException if the op code is not recognized */
    public Writable call(Writable param) throws IOException {
      Param p = (Param)param;
      logRequest(p);
      Writable value;
      switch (p.op) {
      case OP_SEGMENTS:
        value = new ArrayWritable(bean.getSegmentNames());
        break;
      case OP_SEARCH:
        value = bean.search((Query)p.first, ((IntWritable)p.second).get());
        break;
      case OP_EXPLAIN:
        value = new UTF8(bean.getExplanation((Query)p.first, (Hit)p.second));
        break;
      case OP_DETAILS:
        value = bean.getDetails((Hit)p.first);
        break;
      case OP_SUMMARY:
        value = new UTF8(bean.getSummary((HitDetails)p.first,(Query)p.second));
        break;
      case OP_CONTENT:
        value = new BytesWritable(bean.getContent((HitDetails)p.first));
        break;
      case OP_ANCHORS:
        value = new ArrayWritable(bean.getAnchors((HitDetails)p.first));
        break;
      default:
        throw new RuntimeException("Unknown op code: " + p.op);
      }
      //LOG.info("Result: "+value);
      return new Result(p.op, value);
    }

    /** Logs a request as <code>thread: opName(first, second)</code>,
     * omitting operands that are <code>NullWritable</code>. */
    private static void logRequest(Param p) {
      StringBuffer buffer = new StringBuffer();
      buffer.append(Thread.currentThread().getName());
      buffer.append(": ");
      buffer.append(OP_NAMES[p.op]);
      buffer.append("(");
      if (p.first != NullWritable.get()) {
        buffer.append(p.first);
        if (p.second != NullWritable.get()) {
          buffer.append(", ");
          buffer.append(p.second);
        }
      }
      buffer.append(")");
      DistributedSearch.LOG.info(buffer.toString());
    }

    /** Runs a search server.  Expects exactly two arguments: the port to
     * listen on and the index directory. */
    public static void main(String[] args) throws Exception {
      String usage = "DistributedSearch$Server <port> <index dir>";

      // Both arguments are required: a lone <port> previously slipped past
      // the check and failed with ArrayIndexOutOfBoundsException on args[1].
      if (args.length != 2) {
        System.err.println(usage);
        System.exit(-1);
      }

      int port = Integer.parseInt(args[0]);
      File directory = new File(args[1]);

      Server server = new Server(directory, port);
      //server.setTimeout(Integer.MAX_VALUE);
      server.start();
      server.join();
    }
  }

  /** The search client. */
  public static class Client extends net.nutch.ipc.Client
    implements Searcher, HitDetailer, HitSummarizer, HitContent {

    private InetSocketAddress[] addresses;        // the servers searched
    private HashMap segmentToAddress = new HashMap(); // segment name -> server

    /** Construct a client talking to servers listed in the named file.
     * Each line in the file lists a server hostname and port, separated by
     * whitespace. */
    public Client(File file) throws IOException {
      this(readConfig(file));
    }

    /** Parses the server list file.  Lines with fewer than two
     * whitespace-separated tokens are ignored; tokens after the second are
     * ignored too.
     * @throws NumberFormatException if a port is not a valid integer */
    private static InetSocketAddress[] readConfig(File config)
      throws IOException {
      BufferedReader reader = new BufferedReader(new FileReader(config));
      try {                                       // ensure reader is closed
        ArrayList addrs = new ArrayList();
        String line;
        while ((line = reader.readLine()) != null) {
          StringTokenizer tokens = new StringTokenizer(line);
          if (tokens.hasMoreTokens()) {
            String host = tokens.nextToken();
            if (tokens.hasMoreTokens()) {
              String port = tokens.nextToken();
              addrs.add(new InetSocketAddress(host, Integer.parseInt(port)));
              DistributedSearch.LOG.info("Client adding server "
                                         + host + ":" + port);
            }
          }
        }
        return (InetSocketAddress[])
          addrs.toArray(new InetSocketAddress[addrs.size()]);
      } finally {
        reader.close();
      }
    }

    /** Construct a client talking to the named servers.  Each server is
     * asked for its segment names in order to build the map from segment
     * name to server address.  A server that yields no result is logged and
     * contributes no segments. */
    public Client(InetSocketAddress[] addresses) throws IOException {
      super(Result.class);
      this.addresses = addresses;

      // build segmentToAddress map
      Param param = new Param(OP_SEGMENTS, NullWritable.get());
      Writable[] params = new Writable[addresses.length];
      for (int i = 0; i < params.length; i++) {
        params[i] = param;                        // build param for parallel call
      }
      Writable[] results = call(params, addresses); // make parallel call

      for (int i = 0; i < results.length; i++) {  // process results of call
        Result result = (Result)results[i];
        if (result == null) {
          DistributedSearch.LOG.warning("Client: no segments from: "
                                        + addresses[i]);
          continue;
        }
        String[] segments = ((ArrayWritable)result.value).toStrings();
        for (int j = 0; j < segments.length; j++) {
          DistributedSearch.LOG.info("Client: segment "+segments[j]
                                     +" at "+addresses[i]);
          segmentToAddress.put(segments[j], addresses[i]);
        }
      }
    }

    /** Return the names of segments searched. */
    public String[] getSegmentNames() {
      return (String[])
        segmentToAddress.keySet().toArray(new String[segmentToAddress.size()]);
    }

    /** Sends the query to every server in parallel and merges the hits
     * returned into a single list of at most <code>numHits</code> hits.
     * Servers that yield no result are skipped.  Each merged hit's index
     * number is the position of its server in the address list, which is how
     * later per-hit calls are routed.
     * @return the merged hits, with a total summed over all responding
     * servers */
    public Hits search(Query query, int numHits) throws IOException {
      long totalHits = 0;

      Param param = new Param(OP_SEARCH, query, new IntWritable(numHits));
      Writable[] params = new Writable[addresses.length];
      for (int i = 0; i < params.length; i++) {
        params[i] = param;                        // build param for parallel call
      }
      Writable[] results = call(params, addresses); // make parallel call

      TreeSet queue = new TreeSet();              // cull top hits from results
      float minScore = 0.0f;
      for (int i = 0; i < results.length; i++) {
        Result result = (Result)results[i];
        if (result == null)                       // no result from this server
          continue;
        Hits hits = (Hits)result.value;
        totalHits += hits.getTotal();
        for (int j = 0; j < hits.getLength(); j++) {
          Hit hit = hits.getHit(j);
          if (hit.getScore() >= minScore) {
            // re-key the hit with the index of the server it came from
            queue.add(new Hit(i, hit.getIndexDocNo(), hit.getScore()));
            if (queue.size() > numHits) {         // if hit queue overfull
              queue.remove(queue.last());         // remove lowest in hit queue
              // The queue is empty here when numHits <= 0; calling last()
              // would then throw NoSuchElementException.
              if (!queue.isEmpty()) {
                minScore = ((Hit)queue.last()).getScore(); // reset minScore
              }
            }
          }
        }
      }
      return new Hits(totalHits, (Hit[])queue.toArray(new Hit[queue.size()]));
    }

    /** Asks the server that produced the hit to explain its score. */
    public String getExplanation(Query query, Hit hit) throws IOException {
      Param param = new Param(OP_EXPLAIN, query, hit);
      Result result = (Result)call(param, addresses[hit.getIndexNo()]);
      return result.value.toString();
    }

    /** Fetches the details of a hit from the server that produced it. */
    public HitDetails getDetails(Hit hit) throws IOException {
      Param param = new Param(OP_DETAILS, hit);
      Result result = (Result)call(param, addresses[hit.getIndexNo()]);
      return (HitDetails)result.value;
    }

    /** Fetches the details of several hits in one parallel call, each from
     * the server that produced it.
     * @return an array parallel to <code>hits</code>; an entry is null when
     * no result was obtained for that hit */
    public HitDetails[] getDetails(Hit[] hits) throws IOException {
      Writable[] params = new Writable[hits.length];
      InetSocketAddress[] addrs = new InetSocketAddress[hits.length];
      for (int i = 0; i < hits.length; i++) {
        params[i] = new Param(OP_DETAILS, hits[i]);
        addrs[i] = addresses[hits[i].getIndexNo()];
      }
      Writable[] writables = call(params, addrs);
      HitDetails[] results = new HitDetails[writables.length];
      for (int i = 0; i < results.length; i++) {
        // tolerate missing results, as getSummary(HitDetails[], Query) does
        if (writables[i] != null)
          results[i] = (HitDetails)((Result)writables[i]).value;
      }
      return results;
    }

    /** Fetches a summary of the hit for the query from the server holding
     * the hit's segment, as named by its "segment" field. */
    public String getSummary(HitDetails hit, Query query) throws IOException {
      Param param = new Param(OP_SUMMARY, hit, query);
      InetSocketAddress address =
        (InetSocketAddress)segmentToAddress.get(hit.getValue("segment"));
      Result result = (Result)call(param, address);
      return result.value.toString();
    }

    /** Fetches summaries of several hits in one parallel call, each from the
     * server holding the hit's segment.
     * @return an array parallel to <code>hits</code>; an entry is null when
     * no result was obtained for that hit */
    public String[] getSummary(HitDetails[] hits, Query query)
      throws IOException {
      Writable[] params = new Writable[hits.length];
      InetSocketAddress[] addrs = new InetSocketAddress[hits.length];
      for (int i = 0; i < hits.length; i++) {
        HitDetails hit = hits[i];
        params[i] = new Param(OP_SUMMARY, hit, query);
        addrs[i] =
          (InetSocketAddress)segmentToAddress.get(hit.getValue("segment"));
      }
      Writable[] results = call(params, addrs);
      String[] strings = new String[results.length];
      for (int i = 0; i < results.length; i++) {
        if (results[i] != null)
          strings[i] = ((Result)results[i]).value.toString();
      }
      return strings;
    }

    /** Fetches the content of a hit from the server holding its segment. */
    public byte[] getContent(HitDetails hit) throws IOException {
      Param param = new Param(OP_CONTENT, hit);
      InetSocketAddress address =
        (InetSocketAddress)segmentToAddress.get(hit.getValue("segment"));
      Result result = (Result)call(param, address);
      return ((BytesWritable)result.value).get();
    }

    /** Fetches the anchors of a hit from the server holding its segment. */
    public String[] getAnchors(HitDetails hit) throws IOException {
      Param param = new Param(OP_ANCHORS, hit);
      InetSocketAddress address =
        (InetSocketAddress)segmentToAddress.get(hit.getValue("segment"));
      Result result = (Result)call(param, address);
      return ((ArrayWritable)result.value).toStrings();
    }

    /** Runs a query against the servers named on the command line and
     * prints the total hit count and the details of the top ten hits.  The
     * first argument is the query; the rest are host/port pairs. */
    public static void main(String[] args) throws Exception {
      String usage = "DistributedSearch$Client query <host> <port> ...";

      if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
      }

      Query query = Query.parse(args[0]);

      // arguments after the query are consumed in <host> <port> pairs
      InetSocketAddress[] addresses = new InetSocketAddress[(args.length-1)/2];
      for (int i = 0; i < (args.length-1)/2; i++) {
        addresses[i] =
          new InetSocketAddress(args[i*2+1], Integer.parseInt(args[i*2+2]));
      }

      Client client = new Client(addresses);
      //client.setTimeout(Integer.MAX_VALUE);

      Hits hits = client.search(query, 10);
      System.out.println("Total hits: " + hits.getTotal());
      for (int i = 0; i < hits.getLength(); i++) {
        System.out.println(" "+i+" "+ client.getDetails(hits.getHit(i)));
      }
    }
  }
}